package com.thundern.tdcrawler.service.task.pageparser;

import java.util.List;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

import com.thundern.tdcrawler.model.Page;
import com.thundern.tdcrawler.model.Site;

/**
 * Page parser registered as the Spring bean {@code oschinaBlogPageParser}.
 *
 * <p>For every page it queues the links matching the configured link pattern,
 * then stores the {@code title}, {@code content} and {@code tags} fields on the
 * page. Pages for which no title is stored are flagged as skipped.
 */
@Component("oschinaBlogPageParser")
public class OschinaBlogPageParser extends AbstractPageParser {

	/** Site definition, injected from the bean qualified as {@code oschinaBlogSite}. */
	@Autowired
	@Qualifier("oschinaBlogSite")
	private Site site;

	/** Pattern used to filter the page's links; read from {@code p_regex['oschinablog.links']}. */
	@Value("#{p_regex['oschinablog.links']}")
	private String regexLinks;

	/**
	 * Expression used to extract the title; read from {@code p_regex['oschinablog.title']}.
	 * NOTE(review): it is passed to {@code xpath(...)}, so it is presumably an XPath
	 * expression despite the "regex" prefix - confirm against the properties file.
	 */
	@Value("#{p_regex['oschinablog.title']}")
	private String regexTitle;

	/**
	 * Expression used to extract the tags; read from {@code p_regex['oschinablog.tags']}.
	 * NOTE(review): also passed to {@code xpath(...)} - presumably an XPath expression.
	 */
	@Value("#{p_regex['oschinablog.tags']}")
	private String regexTags;

	/**
	 * Queues follow-up links and stores the {@code title}, {@code content} and
	 * {@code tags} fields on the given page.
	 *
	 * <p>When the stored title is {@code null} the page is marked as skipped; the
	 * content and tags are still extracted and stored afterwards.
	 *
	 * @param page the page to parse
	 * @throws NullPointerException declared by this signature; presumably raised when
	 *         the page or one of the intermediate objects is {@code null} (TODO confirm)
	 */
	@Override
	public void parsePage(Page page) throws NullPointerException {
		queueMatchingLinks(page);

		final String title = page.getHtml().xpath(regexTitle).toString();
		page.putField("title", title);

		// Read the title back from the result items, exactly as stored above.
		final boolean titleMissing = page.getResultItems().get("title") == null;
		if (titleMissing) {
			// skip this page
			page.setSkip(true);
		}

		final String content = page.getHtml().smartContent().toString();
		page.putField("content", content);
		page.putField("tags", page.getHtml().xpath(regexTags).all());
	}

	/** Adds every link of the page that matches {@link #regexLinks} as a target request. */
	private void queueMatchingLinks(Page page) {
		final List<String> matchingLinks = page.getHtml().links().regex(regexLinks).all();
		page.addTargetRequests(matchingLinks);
	}

	/**
	 * Returns the injected site definition.
	 *
	 * @return the {@link Site} bean qualified as {@code oschinaBlogSite}
	 */
	public Site getSite() {
		return this.site;
	}

}
